catalist <- read_csv("data/main_catalist_file.csv")


# Add pct R variable
catalist <- catalist %>%
 rowwise() %>% 
 mutate(pct_R = sum(R.y.020 + R.y.2040 + R.y.4060 + R.y.6080 + 
                                              R.y.80100 + R.y.100120 + R.y.120140 + 
                                              R.y.140160 + R.y.160180 + R.y.180200 + R.unk)/
                      sum(R.y.020 + R.y.2040 + R.y.4060 + R.y.6080 + R.y.80100 + 
                            R.y.100120 + R.y.120140 + R.y.140160 + R.y.160180 + 
                            R.y.180200 + R.y.020+D.y.2040 + D.y.4060 + D.y.6080 + 
                            D.y.80100 + D.y.100120 + D.y.120140 + D.y.140160 + D.y.160180 + D.y.180200 + 
                            D.unk + R.unk + O.y.020 + O.y.2040 + O.y.4060 + O.y.6080 +
                            O.y.6080 + O.y.80100 + O.y.100120 + O.y.120140 + O.y.140160 + 
                            O.y.160180 + O.y.180200 + O.unk, 
                      total_R = sum(R.y.020 + R.y.2040 +R.y.4060 + R.y.6080 + 
                                      R.y.80100 + R.y.100120 + R.y.120140 + 
                                      R.y.140160 + R.y.160180 + R.y.180200)))

# Merge with region 

key <- data.frame(stabbr = state.abb, stregion = state.region)

catalist <- left_join(catalist, key, by = 'stabbr')

catalist_sub <- catalist %>% select(pct_R, sh_black, sh_medinc, stregion) #, total_R)

catalist_sub$south <- ifelse(catalist$stregion == "South", 1, 0)

# Private regression
formula <- formula(pct_R ~ sh_medinc + sh_black + south)
private_reg <- lm(formula, data = catalist_sub)

# Add DP noise 
n <- nrow(catalist_sub)

reDraw <- function(){
  catalist_dp <- catalist_sub[, -4] 

  catalist_dp$pct_R <- catalist_sub$pct_R + rnorm(n, 0, sd(catalist_sub$pct_R))
  catalist_dp$sh_black <- catalist_sub$sh_black + rnorm(n, 0, 1.5*sd(catalist_sub$sh_black))
  catalist_dp$sh_medinc <- catalist_sub$sh_medinc + rnorm(n, 0, 0.5*sd(catalist_sub$sh_medinc))

  S_vec <- c(sd(catalist_sub$pct_R), 
                  1.5*sd(catalist_sub$sh_black), 
              0.5*sd(catalist_sub$sh_medinc),  0)#, 
            # 0.01*sd(catalist_sub$total_R))we
  
  catalist_dp <- as.data.frame(rbind(S_vec, catalist_dp))
  
  # Run naive and BC regression
  naive_reg <- lm(formula, data = catalist_dp[-1, ])
  bc_reg <- PrivacyUnbiased::lmdp(formula, data = catalist_dp)
  
  c(coef(naive_reg), bc_reg$beta_tilde, 
    sqrt(diag(bc_reg$b_vcov)), sqrt(diag(bc_reg$beta_tilde_vcov)))

}

set.seed(02138)

print('running sims')

application_sims <- t(replicate(500, reDraw()))

mean_coef <- apply(application_sims, 2, mean)

sum_sim <- as.data.frame(rbind(as.numeric(coef(private_reg)), mean_coef[1:4], mean_coef[5:8]))

point_estimates <- as.data.frame(cbind(estimator = c('private_coef', 'naive_coef', 'bc_coef'), sum_sim))


color_vec <- c('orange', 'orange3', 'dodgerblue3', 'navyblue', 'indianred3')


application_plot <- ggplot() +  
  geom_density(aes(x = application_sims[, 7]), col = color_vec[4], adjust = 5) + xlim(c(-0.5, 0.01)) + 
  stat_density(aes(x = application_sims[,3]), col = color_vec[2], geom="line") + 
  theme_bw() +
  theme(panel.grid.major = element_blank(), 
  panel.grid.minor = element_blank()) + 
  annotate("text", x = -0.38, y = 11.5, label = 'Bias corrected', colour = color_vec[4]) + 
  annotate("text", x = -0.12, y = 35, label = 'Naive', colour = color_vec[2]) + 
  annotate("text", x = -0.27, y = 30, label = 'Private', colour = color_vec[3]) + 
  labs(x = 'Coefficient estimate') + 
  geom_hline(yintercept=0, colour="white", size= 0.9) + 
  geom_vline(aes(xintercept = coef(private_reg)['sh_black']), linetype = 'dashed', col = color_vec[3]) + 
  geom_point(aes(x = 0, y = 50), col = 'white') +
  scale_y_continuous(expand = c(0, 0), limits = c(0, NA)) 
  

ggsave(application_plot, file = 'plots/figure3.pdf', width = 6, height = 4)
  
